Kaggle Submission_Final(Python)

Loading...
import nltk
import re
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.sql.functions import *
from nltk.stem import WordNetLemmatizer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from sparknlp.annotator import Tokenizer, Normalizer, StopWordsCleaner, Stemmer
from pyspark.sql.types import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
from pyspark.ml import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, NGram, VectorAssembler, Word2Vec
from pyspark.sql.window import Window
from pyspark.ml.feature import *
from nltk.stem import SnowballStemmer

# nltk.download('all')
# Load in one of the tables
df = spark.sql("select * from default.reviews_train").sample(0.25, seed=47)
print((df.count(), len(df.columns)))
(785188, 11)
df.printSchema()
root |-- reviewID: integer (nullable = true) |-- overall: double (nullable = true) |-- verified: boolean (nullable = true) |-- reviewTime: string (nullable = true) |-- reviewerID: string (nullable = true) |-- asin: string (nullable = true) |-- reviewerName: string (nullable = true) |-- reviewText: string (nullable = true) |-- summary: string (nullable = true) |-- unixReviewTime: integer (nullable = true) |-- label: integer (nullable = true)
# Let's look at some quick summary statistics
display(df.withColumnRenamed('summary', 'summaryCol').describe())
 
summary
reviewID
overall
reviewTime
reviewerID
asin
reviewerName
reviewText
1
2
3
4
5
count
785188
785188
785188
785188
785188
785130
785188
mean
1569147.717898134
4.318688008476951
null
null
4.393540211092187E7
NaN
2.8605570603571427E8
stddev
906429.9981685955
1.1248134204859315
null
null
2.6737754265206018E8
NaN
1.5136639896087563E9
min
4
1.0
01 1, 1998
A0001528BGUBOEVR6T5U
0000913154
max
3138708
5.0
12 9, 2017
AZZZYAYJQSDOJ
B01HIZF7XE
~~Trish~~
~~~~~ALL GREAT WORKS by DICKENS AND OTHER WONDERFUULL WRITERS ARE AVAILABLE "FREE" FOM AMAZON.~~~~ >>> SEARCH FREE KINCLE BOOKS<<< THANX TO YOU AMAZON @<~~<~~~~~ DAWN MARIE
5 rows
# The count of each overall rating
display(df.groupBy("overall").count().orderBy(col("overall").asc()))
 
overall
count
1
2
3
4
5
1
40235
2
34730
3
65869
4
138090
5
506264
5 rows
# The most common product IDs
display(df.groupBy("asin").count().orderBy(col("count").desc()))
 
asin
count
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
0007420412
4469
000711835X
4139
0007548672
3632
0007350899
1779
0007444117
1770
B000YGEVMI
1640
0007378033
1422
B0015TMHSI
1317
006017322X
1202
0007384289
1143
0007350783
1085
0002247399
1077
0007155662
1055
B0014CX87U
1031
0007141424
1012
0008220565
951
0007271239
914
10,000 rows|Truncated data
def date_time_extraction(df):

    # Assuming df is your DataFrame and reviewTime is the column name
    df = df.withColumn("reviewDate", to_date(df.reviewTime, "MM dd, yyyy"))

    # Extract the month, day, and year into new columns
    df = df.withColumn("reviewMonth", month("reviewDate"))
    df = df.withColumn("reviewDay", dayofmonth("reviewDate"))
    df = df.withColumn("reviewYear", year("reviewDate"))
    df = df.withColumn('reviewdayOfWeek', dayofweek(col('reviewDate')))
    df = df.withColumn("reviewYearDate", to_date(col("reviewYear"), "yyyy"))
    df = df.withColumn("verified", when(col("verified") == True, 1).otherwise(0))

    return df
df = date_time_extraction(df)
def review_counts_per_customer(data_input):
    
    # Finding the total number of reviews a customer left in their lifetime and the unique products reviewed

    review_counts = data_input.groupBy("reviewerID").agg(
        countDistinct('reviewID').alias('reviews_per_customer'),
        countDistinct('asin').alias('products_reviewed_per_customer'))
    
    data_input = data_input.join(review_counts, on="reviewerID", how="inner")

    return data_input
df = review_counts_per_customer(df)
def review_counts_per_product(data_input):
    
    # Finding the total number of reviews a product generated in its lifetime

    review_counts = data_input.groupBy("asin").agg(
        countDistinct("reviewID").alias("reviews_per_product"),
        countDistinct('reviewerID').alias("reviewers_per_product"),
        min('reviewDate').alias('product_earliest_review'),
        max('reviewDate').alias('product_latest_review'))
        
    data_input = data_input.join(review_counts, on="asin", how="inner")
    
    data_input = data_input.withColumn('product_review_interval', datediff(data_input['product_latest_review'], data_input['product_earliest_review']))

    data_input = data_input.withColumn("product_review_interval", when(col("product_latest_review") == col("product_earliest_review"), 0).otherwise(col("product_review_interval")))
    
    data_input = data_input.withColumn('product_earliest_review', datediff(current_date(), data_input['product_earliest_review']))
    
    data_input = data_input.withColumn('product_latest_review', datediff(current_date(), data_input['product_latest_review']))
        
    return data_input
df = review_counts_per_product(df)
display(df)
 
asin
reviewerID
reviewID
overall
verified
reviewTime
reviewerName
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
B0000665TD
A10173YNR68M0T
291238
5
true
09 7, 2016
John Kirschbaum
B0001WN0MW
A10DIDVYDN2Q5Z
3112636
5
false
08 27, 2004
M. Alterio
B0011TRO80
A10NPRFXUCUVME
2168704
5
true
07 29, 2015
Nick Ouwerkerk
B00BMFIXT2
A10YMY9ZUFBL1P
2965813
5
true
12 7, 2013
Don&#039;t Be a Troll
B00JDP1AWU
A11AXAQ59CWZZV
3014576
4
true
03 12, 2017
Jeremy
B0000665TD
A12K29KCD2VNDS
293062
5
true
08 16, 2013
Albino Monkey
B001F51A4Y
A12KPDAG1IOC4D
2438571
5
true
04 17, 2012
J &amp; K Martin
B0011TRO80
A12MKYS4WUKGV9
1918870
4
true
10 29, 2013
Matt Fay
B0015HZLVA
A12Q992TYCVB4Y
364758
5
true
07 30, 2014
SaeDus
B000X6K9J8
A138CEXJ9ZOB7C
2121691
5
true
04 15, 2016
Marla Nine
B000X6K9J8
A13DZ9YC2TV9W1
2122194
5
true
02 20, 2014
J. Kowalik
B000X6K9J8
A13FEQBTOWU10D
1824804
5
true
09 6, 2015
Sassy Pantalones
0007391625
A13H5US9Z7L9M7
644090
2
false
10 7, 2012
Just kath
B000DZD1AS
A13ICLXE5CDMZL
1037383
4
true
05 28, 2016
Martha J. Canary
B006WQR3GA
A13IEAQ1T6PT98
2939767
4
false
10 24, 2012
Brian Dowrick
B00BMFIXT2
A13VJ63EJR3RE1
2964863
5
true
10 23, 2015
Lauren A.
B000OH5HMK
A13VRD542WGVGK
1528677
1
false
06 6, 2007
Ruth A. CARRAWAY
3,440 rows|Truncated data
# Filtering out rows 

rows_with_missing = df.filter(col("reviewText").isNull() | col("summary").isNull() | col("label").isNull())
display(rows_with_missing)
 
asin
reviewerID
reviewID
overall
verified
reviewTime
reviewerName
reviewText
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
B0000DJUYR
A1BVBYEQLLYACZ
601702
5
1
12 10, 2017
Mike
Nice hangs in RV fridge
0060563567
A3V799ATZNRHGQ
1435466
3
1
04 18, 2017
karen driskell
Difficult to follow
B00KAED850
A1EKJBLI9211S8
3020414
5
1
06 16, 2016
Manuel Castellanos
Nice open world game nice graphics and playability
B013H0IRO0
A1U86MCXJLE57H
3074829
5
0
12 2, 2016
Andre
Pretty good game! I love it
0007581270
A2NY7PKG4Q0IQB
889222
1
0
07 29, 2015
#awesome
This was not an easy thing you need a great deal on this one day you want it is the
B00006IFR9
AY3433RWKL094
320071
5
1
09 20, 2016
Sherri D
Love Fiesta Ware
B000N4WN08
AGZ524TKNNUO5
1467477
5
1
11 21, 2016
Carina P
It was get burnt for the first use !
B003N3IF30
A3DTD6XD66H0UY
2899014
5
1
06 16, 2015
Mrs. Hernandez
Nice idea, works as expected.
B006VB2UO6
AMX6BHZCRIRJZ
2939073
1
1
07 17, 2012
Amazon User A113
This game is more worthless than the HD-DVD players when they came out. If you want a paper weight...go for it. If you want a college football game, pass on it. The game freezes before dynasty games...I have restarted my system over 10 times now hoping it will work....NOPE, every time in Dynasty when you get to the load screen it freezes. (Yes, I've tried installing it to the xBox then playing) As for the normal game play (any team vs any team), same as last year. Very minor updates to game ...
B0029DPM18
A2SXIZSTGVO7OF
2662860
5
1
04 1, 2017
flhr
Works good.
B00JK00S0S
A1EKJBLI9211S8
3015852
5
1
06 16, 2016
Manuel Castellanos
The incredible and survival history will keep you all the time waiting for something unexpected, good game super....
B00JK00S0S
A1FNHLDDJ8M2A
3015997
5
1
03 11, 2015
Joey
This is a must have game! I had it on PS3 & bought it again for PS4,the story is great along with the multiplayer. It's not call of duty where you can run around like The Ultimate Warrior beating down everyone in your path,it's a survival game where ammo & health is limited so you must work as a team. 5/5 stars
B00009R66F
AOGU17DLH0A4M
430598
5
1
08 8, 2013
will
the best overall the best machine I have ever used...Why it is just as easy as a vacuum t operate...The brushes really make this product... The hand held tools work real well also... I used Awesome dollar store cleaner and did the couch it came out like new...Great on the carpet....Very good price w/ delivery you cannot beat it...I have told every one how easy it is to use and clean too... If you have pets or kids this is the one machine for the hallway traffic as well ...ok so maby it might ...
0006625746
A1C3FVPZBJTJ2Z
338270
4
0
10 1, 2003
Vicki Peck
This book was wonderfully constructed & thought out and later became one of my favorite movies, starring Angelica Houston as the Grand High Witch and she plays it like no one else could! If you can't get ahold of a copy of this book, rent the movie!!!! Both are fun, a little scary and an outrageous romp. Enjoy!
B001MMSWGO
A8J2MRDWHDS9U
2517336
5
1
12 18, 2016
Aussianna
Fast delivery, everything I expected.
B0026IBEDG
A1O5HADXZG45FN
2627072
5
1
10 27, 2016
SheilaT
As expected for cotton
B00005KB37
A1I3DSZD65E4VD
208383
5
1
04 23, 2018
L.L.
I fry everything from pierogies to clam strips in my Frydaddy and I love it! You only need 4 cups of oil. I like to use Canola oil. Oil heats up very quickly. The manual says 15 minutes but it seems faster then that. The oil splatter is very minimal because of the nice high sides on the fryer. There is a magnetic cord that attaches easily. Its a perfect size to move around with ease and storage is simple! This Frydaddy gets 5 stars from me!
83 rows
# Find rows with empty strings in specified columns
empty_rows = df.filter(col("summary") == " ")

# Show the rows with empty strings
display(empty_rows)
Query returned no results
df = df.filter(col("summary") != " ")
print((df.count(), len(df.columns)))
(785105, 24)
def sentiment_analysis(data_input):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType, StructType, StructField, FloatType
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # Initialize the VADER sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    # Define a UDF to perform sentiment analysis
    def analyze_sentiment(text):
        sentiment = analyzer.polarity_scores(text)
        return sentiment["compound"]

    # Register the UDF with FloatType return type
    sentiment_udf = udf(analyze_sentiment, FloatType())

    # Apply the sentiment analysis UDF to the DataFrame
    data_input = data_input.withColumn("sentiment_score", sentiment_udf(data_input["reviewText"]))

    # Show the result
    return data_input
def review_level_features(df):
    
    # Calculating the length of every review - negative reviews tend to be longer
    
    df = df.withColumn("Words", split(df["reviewText"], " "))
    df = df.withColumn("Review_Length", size(df["Words"]))
 
    # Bucketing reviews into high and low
 
    condition = (col("overall") > 2)
    df = df.withColumn("overall_high", when(condition, 1).otherwise(0))
 
    # Bucketing customer reviews into positive & negative / neutral experiences
 
    df = sentiment_analysis(df)
    sentiment_condition = (col("sentiment_score") > 0)
    df = df.withColumn("positive_experience", when(sentiment_condition, 1).otherwise(0))
 
    neutral_exp_condition = (col("sentiment_score") == 0)
    df = df.withColumn("neutral_experience", when(neutral_exp_condition, 1).otherwise(0))
 
    # Do customers review only if there is a problem? Or do they always review in general? 
 
    result = df.groupBy("reviewerID").agg(
        avg("sentiment_score").alias("average_sentiment_score_per_customer"),
        percentile_approx("sentiment_score", 0.5).alias("median_sentiment_score_per_customer"))
    
    negative_condition = (col("average_sentiment_score_per_customer") < 0)
    result = result.withColumn("negative_exp_reviewer", when(negative_condition, 1).otherwise(0))
    df = df.join(result, on="reviewerID", how="left")
 
    # General sentiment of a product
 
    sentiment_product = df.groupBy("asin").agg(
        avg("sentiment_score").alias("avg_sentiment_score_per_product"),
        percentile_approx("sentiment_score", 0.5).alias("median_sentiment_score_per_product")
    )
 
    df = df.join(sentiment_product, on="asin", how="left")
    df = df.withColumn("review_divergence", ((col("sentiment_score") - col("avg_sentiment_score_per_product")) / col("avg_sentiment_score_per_product")))
 
    df = df.withColumn("review_divergence", when(col("review_divergence").isNull(), 0).otherwise(col("review_divergence")))
 
    columns_to_drop = [
        # "average_sentiment_score_per_customer",
                       "sentiment_score"]
 
    df = df.drop(*columns_to_drop)
 
    return df
df = review_level_features(df)
def customer_level_date_time_features(df):

    # Calculate the number of days since review creation - idea is to find out how recent a review is. More recent reviews are more relevant than older ones
    
    df = df.withColumn("days_since_review_creation", datediff(current_date(), col("reviewDate")))

    df = df.withColumn("months_since_review_creation", col("days_since_review_creation") / 30)

    # Finding the first & last review of each reviewer
    
    result = df.groupBy("reviewerID").agg(
        min("reviewDate").alias("first_review_date"), 
        max("reviewDate").alias("last_review_date"))

    df = df.join(result, on="reviewerID", how="inner")

    # Finding the number of reviews the customer leaves per year

    df = df.withColumn("reviews_per_year", col("reviews_per_customer") / (datediff(col("last_review_date"), col("first_review_date")) / 365))

    # Shows how long the account was active for

    df = df.withColumn("months_since_customers_first_review", datediff(current_date(), col("first_review_date")) / 30)

    # Shows when the most recent review happened

    df = df.withColumn("months_since_customers_last_review", datediff(current_date(), col("last_review_date")) / 30)

    # df = df.withColumn("reviews_per_year", when(col("first_review_date") == col("last_review_date"), 1).otherwise(col("reviews_per_year")))

    df = df.withColumn("reviews_per_year_per_customer", when(col("first_review_date") == col("last_review_date"), 1).otherwise(col("reviews_per_year")))

    columns_to_drop = ["reviewYearDate",
                       "last_review_date",
                       "first_review_date", 
                       "reviews_per_year"
                    ]

    df = df.drop(*columns_to_drop)

    return df
df = customer_level_date_time_features(df)
agg_df = df.filter(col("label") == 1).groupBy("reviewMonth").agg(count('*').alias("review_count"))

# Step 4: Convert the PySpark DataFrame to a Pandas DataFrame
pandas_df = agg_df.toPandas()

# Step 5: Create the time series graph using Matplotlib
plt.figure(figsize=(12, 6))
plt.bar(pandas_df["reviewMonth"], pandas_df['review_count'], color='b')
plt.title('Customer Reviews Over Time')
plt.xlabel('Time')
plt.ylabel('Review Count')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

def org_level_features(df):

    condition = ((col("reviewMonth") >= 10) | (col("reviewMonth") == 1))
    df = df.withColumn("High_Volume_Timeframes", when(condition, 1).otherwise(0))

    columns_to_drop = ["reviewDate", 
                       "reviewMonth", "reviewDay", "reviewYear"
                    ]

    df = df.drop(*columns_to_drop)
    
    return df
df = org_level_features(df)
df.columns
Out[25]: ['reviewerID', 'asin', 'reviewID', 'overall', 'verified', 'reviewTime', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'label', 'reviewdayOfWeek', 'reviews_per_customer', 'products_reviewed_per_customer', 'reviews_per_product', 'reviewers_per_product', 'product_earliest_review', 'product_latest_review', 'product_review_interval', 'Words', 'Review_Length', 'overall_high', 'positive_experience', 'neutral_experience', 'average_sentiment_score_per_customer', 'median_sentiment_score_per_customer', 'negative_exp_reviewer', 'avg_sentiment_score_per_product', 'median_sentiment_score_per_product', 'review_divergence', 'days_since_review_creation', 'months_since_review_creation', 'months_since_customers_first_review', 'months_since_customers_last_review', 'reviews_per_year_per_customer', 'High_Volume_Timeframes']
def cleaning_review_data(df):

    stemmer = SnowballStemmer(language="english")

    def stem_text(text):
        return " ".join([stemmer.stem(word) for word in text.split()])

    stem_udf = udf(stem_text, StringType())

    stemmed_df = df.withColumn("stemmed_text", stem_udf(col("reviewText")))

    # convert text to all lower cases
    lower_df = stemmed_df.withColumn("lowered_text", lower(col("stemmed_text")))

    # remove punctuations
    cleaned_df = lower_df.withColumn("cleaned_text", regexp_replace(col("lowered_text"), "[^a-zA-Z\\s]", ""))

    # tokenize reviewText in order to filter out stop words
    tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="words")
    tokenized_df = tokenizer.transform(cleaned_df)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered_text")
    filtered_df = remover.transform(tokenized_df)

    df = filtered_df.withColumn('simplified_reviewText', concat_ws(' ', 'filtered_text'))

    df = df.withColumn('review_length_original_chars', length(df['reviewText']))
    df = df.withColumn('review_length_simplified_chars', length(df['simplified_reviewText']))

    return df
def detecting_non_verbal_cues(df):
    
    # UDF to count the number of question marks
    def count_question_marks(text):
        return text.count("?")

    # UDF to count the number of exclamation marks
    def count_exclamation_marks(text):
        return text.count("!")

    # UDF to count the number of all-capital words
    def count_all_caps(text):
        return len([word for word in text.split() if word.isupper()])

    # Register UDFs
    count_question_marks_udf = udf(count_question_marks, IntegerType())
    count_exclamation_marks_udf = udf(count_exclamation_marks, IntegerType())
    count_all_caps_udf = udf(count_all_caps, IntegerType())

    # Apply UDFs to DataFrame
    df = df.withColumn("question_marks_count", 
                       count_question_marks_udf("reviewText")) \
                        .withColumn("exclamation_marks_count", count_exclamation_marks_udf("reviewText")) \
                        .withColumn("all_caps_count", count_all_caps_udf("reviewText"))

    return df
df = detecting_non_verbal_cues(df)
df = cleaning_review_data(df)
drop_list = [
    'asin', 
    'reviewID', 
    'reviewerID', 
    'unixReviewTime',
    'reviewTime', 
    'image', 
    'style', 
    'reviewerName', 
    'overall', 
    'stemmed_text', 
    'lowered_text', 
    'cleaned_text', 
    'words', 
    'filtered_text']

df = df.select([column for column in df.columns if column not in drop_list])
print((df.count(), len(df.columns)))
(785105, 34)
display(df)
 
verified
reviewText
summary
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
true
This item does not fit across a double basin sink or normal 4 burner stove. It's way too small. It does make a nice inexpensive cutting board.
Too small...
true
It seems to be a good mouse but I have medium sized hands and it still feels kind of small. It feels really thin (like a apple mouse) and I guess I generally prefer a bigger more comfy mouse. That being said, it is accurate and has nice color scheme.
Kind of small
false
This game is fantastic. I can't believe I waited so long to get it. I didn't really like the other ones though. I have tried them all but for some reason I am addicted to this Fallout. I have spent at least 100+ hours so far in this game and only started playing it about 2 months ago. I beat it once as a Melee only character and then restarted as a charisma stealth player. WARNING: Do not complete the story entirely before completing missions in all 4 factions. If you DO complete it before you ...
I didn't like the other Fallout games
true
I really wanted to like this controller but there just isn't much of a reason to use it. There is only 1 benefit to using this over a Xbox 360 controller and that's for RTS games but even then it's terribly annoying. I would suggest just using a keyboard with built in trackpad and that would be better. It's just so.. ackward to use. Just imagine a Xbox controller that replaces one joystick with a circular trackpad from a laptop. That is exactly what this is and it just isn't good for much. For S...
Meh.. Use Xbox controller with Steam-Link
false
I got this game for 50 bucks. I think it was worth it. I am an avid star wars fan and I think this game is great. This is not a very competitive shooter. There is almost zero recoil on the blasters and killing each other is quite easy. There are only 4 maps for the main game mode, Walker Assault mode. If you are not a Star Wars fan, I would say this game would be about a 6/10 for you. However, as a Star Wars fan myself, I am absolutely loving this game. As a star wars fan, this is my review whic...
Non-Star Wars Fans: 3/5 ------- Star Wars Fans: 4/5
false
I bought this game at Target but felt like I should review on Amazon as well. Overall, I am disappointed in this game. Here is the pros and cons of this game (in my opinion). The cons could be pros for you. PROS: - Outstanding graphics - Solid 30 frames per second - Big world to explore - Character customization - Split Screen and local coop CONS: - Walk FOREVER before seeing any action - Clunky user interface - Very slow turn based gameplay - character can't run and takes forever to get places ...
Very slow paced action adventure game
false
I am a big fan of Mary Karr. I read The Liars Club when I was sixteen and have reread it since then. I also read Cherry, which I thought was less good, but still really well written. This book was a little bit dissapointing. Karr starts off well, but it is soon very obvious that she is not going to fully develop her ex-husband as a character; he is like a ghost throughout the book. She doesn't seem as brutally honest in this book, and while I can understand some of her reasoning, I think she sho...
Great Momentum, but some serious flaws
true
This was a summer book reading. Recommended by his sixth grade teacher. It was hard to put down. Very exciting and full of suspense. Highly recommmed.
Super good !!!! Best book Grandson has read so far.
true
I haven't used it yet but I'm already in love. The colors are vivid and rich. The size is a bit bigger than what I thought which is great. I can't wait to use them. I utterly recommend it.
AWESOME
true
Absolutely loved it! Love that all of them are in one book.
Absolutely Love it!
true
Any one who trashes these pans seriously does not know how to cook with stainless. And even the people who claim they do and still have trouble.. well you really don't know how to then... These pans are extremely sturdy, and beautifully built. Awesome weight to them, easy to clean, and heat up quickly and evenly. The only thing I would LOVE is a 12 inch skillet in the set, but that's in now way going to hurt my rating. Buy these pans if you want the best stainless set, for an awesome price
Beautiful Pans!
false
Stolen Innocence is written from the point-of-view of a former member of The Fundamentalist Church of Jesus Christ of Latter Day Saints (FLDS). It was a happy accident that this book was sent to me, and I found myself intrigued enough to continue after reading through the first chapter. To an outsider, the practices of the FLDS can be appalling, such as the practice of polygamy, the powerlessness of women in the society and the families torn apart by the church officials. This book takes a sensi...
Appalling Story, Gripping Read
true
Wish I had a lot of this stuff. It is hard to get here and I had to order it. The Beistle Company sent it super fast and it was exactly what I wanted and needed.. Hope to get more soon.
The Beistle Company sent it super fast and it was exactly what I wanted and ...
true
Looks identical to the image posted, cute and sturdy
cute and sturdy
false
As a fan of L'Engle I was looking forward to an Acceptable Time. All the other Murry books had been fun and interesting and I thought that an Acceptable Time would be up to their quality. Unfortuanatly it's not. At times it is interesting but overall the book as quite boring at times and fails to draw the reader in like the others have. The characters are good but seem less human than some of L'Engle's other books and are not as endearing. An Acceptable Time is worth reading if you are ...
A dissapointment
2,049 rows|Truncated data
df = df.na.drop(subset=df.columns)
# display(df)
print((df.count(), len(df.columns)))
(785105, 34)
df.printSchema()
root |-- verified: boolean (nullable = true) |-- reviewText: string (nullable = true) |-- summary: string (nullable = true) |-- label: integer (nullable = true) |-- reviewdayOfWeek: integer (nullable = true) |-- reviews_per_customer: long (nullable = false) |-- products_reviewed_per_customer: long (nullable = false) |-- reviews_per_product: long (nullable = false) |-- reviewers_per_product: long (nullable = false) |-- product_earliest_review: integer (nullable = true) |-- product_latest_review: integer (nullable = true) |-- product_review_interval: integer (nullable = true) |-- Review_Length: integer (nullable = false) |-- overall_high: integer (nullable = false) |-- positive_experience: integer (nullable = false) |-- neutral_experience: integer (nullable = false) |-- average_sentiment_score_per_customer: double (nullable = true) |-- median_sentiment_score_per_customer: float (nullable = true) |-- negative_exp_reviewer: integer (nullable = true) |-- avg_sentiment_score_per_product: double (nullable = true) |-- median_sentiment_score_per_product: float (nullable = true) |-- review_divergence: double (nullable = true) |-- days_since_review_creation: integer (nullable = true) |-- months_since_review_creation: double (nullable = true) |-- months_since_customers_first_review: double (nullable = true) |-- months_since_customers_last_review: double (nullable = true) |-- reviews_per_year_per_customer: double (nullable = true) |-- High_Volume_Timeframes: integer (nullable = false) |-- question_marks_count: integer (nullable = true) |-- exclamation_marks_count: integer (nullable = true) |-- all_caps_count: integer (nullable = true) |-- simplified_reviewText: string (nullable = false) |-- review_length_original_chars: integer (nullable = true) |-- review_length_simplified_chars: integer (nullable = false)
# Case of imbalanced classification - with a lot fewer helpful reviews
df.groupBy("label").count().show()
Cancelled
def freq_word_count(data):

    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    data = regexTokenizer.transform(data)

    data = data.withColumn("wordCount", size(data["words"]))

    word_counts_label0 = data.filter(col("label") == 0).select(explode(col("words")).alias("word")).groupBy("word").agg(count("*").alias("count_label0"))

    word_counts_label1 = data.filter(col("label") == 1).select(explode(col("words")).alias("word")).groupBy("word").agg(count("*").alias("count_label1"))

    word_counts = word_counts_label0.join(word_counts_label1, "word", "outer").fillna(0)

    word_counts = word_counts.withColumn("frequent_compare", col("count_label1") / col("count_label0"))
    filtered_words = word_counts.filter((col("frequent_compare") > 2) & (col("count_label1") > 10))

    freq_word_list = [row.word for row in filtered_words.select("word").collect()]
    freq_word_set = set(freq_word_list)

    def count_freq_words(words):
        return len([word for word in words if word in freq_word_set])

    count_freq_words_udf = udf(count_freq_words, IntegerType())
    data = data.withColumn("freq_word_count", count_freq_words_udf(col("words")))

    return data, filtered_words
df, filtered_words = freq_word_count(df)
display(df)
 
verified
reviewText
summary
1
2
3
4
5
6
7
8
9
10
11
12
13
1
This item does not fit across a double basin sink or normal 4 burner stove. It's way too small. It does make a nice inexpensive cutting board.
Too small...
0
I got this game for 50 bucks. I think it was worth it. I am an avid star wars fan and I think this game is great. This is not a very competitive shooter. There is almost zero recoil on the blasters and killing each other is quite easy. There are only 4 maps for the main game mode, Walker Assault mode. If you are not a Star Wars fan, I would say this game would be about a 6/10 for you. However, as a Star Wars fan myself, I am absolutely loving this game. As a star wars fan, this is my review whic...
Non-Star Wars Fans: 3/5 ------- Star Wars Fans: 4/5
1
It seems to be a good mouse but I have medium sized hands and it still feels kind of small. It feels really thin (like a apple mouse) and I guess I generally prefer a bigger more comfy mouse. That being said, it is accurate and has nice color scheme.
Kind of small
1
I really wanted to like this controller but there just isn't much of a reason to use it. There is only 1 benefit to using this over a Xbox 360 controller and that's for RTS games but even then it's terribly annoying. I would suggest just using a keyboard with built in trackpad and that would be better. It's just so.. ackward to use. Just imagine a Xbox controller that replaces one joystick with a circular trackpad from a laptop. That is exactly what this is and it just isn't good for much. For S...
Meh.. Use Xbox controller with Steam-Link
0
This game is fantastic. I can't believe I waited so long to get it. I didn't really like the other ones though. I have tried them all but for some reason I am addicted to this Fallout. I have spent at least 100+ hours so far in this game and only started playing it about 2 months ago. I beat it once as a Melee only character and then restarted as a charisma stealth player. WARNING: Do not complete the story entirely before completing missions in all 4 factions. If you DO complete it before you ...
I didn't like the other Fallout games
0
I bought this game at Target but felt like I should review on Amazon as well. Overall, I am disappointed in this game. Here is the pros and cons of this game (in my opinion). The cons could be pros for you. PROS: - Outstanding graphics - Solid 30 frames per second - Big world to explore - Character customization - Split Screen and local coop CONS: - Walk FOREVER before seeing any action - Clunky user interface - Very slow turn based gameplay - character can't run and takes forever to get places ...
Very slow paced action adventure game
0
I am a big fan of Mary Karr. I read The Liars Club when I was sixteen and have reread it since then. I also read Cherry, which I thought was less good, but still really well written. This book was a little bit dissapointing. Karr starts off well, but it is soon very obvious that she is not going to fully develop her ex-husband as a character; he is like a ghost throughout the book. She doesn't seem as brutally honest in this book, and while I can understand some of her reasoning, I think she sho...
Great Momentum, but some serious flaws
1
This was a summer book reading. Recommended by his sixth grade teacher. It was hard to put down. Very exciting and full of suspense. Highly recommmed.
Super good !!!! Best book Grandson has read so far.
1
I haven't used it yet but I'm already in love. The colors are vivid and rich. The size is a bit bigger than what I thought which is great. I can't wait to use them. I utterly recommend it.
AWESOME
1
Absolutely loved it! Love that all of them are in one book.
Absolutely Love it!
1
Any one who trashes these pans seriously does not know how to cook with stainless. And even the people who claim they do and still have trouble.. well you really don't know how to then... These pans are extremely sturdy, and beautifully built. Awesome weight to them, easy to clean, and heat up quickly and evenly. The only thing I would LOVE is a 12 inch skillet in the set, but that's in now way going to hurt my rating. Buy these pans if you want the best stainless set, for an awesome price
Beautiful Pans!
0
Stolen Innocence is written from the point-of-view of a former member of The Fundamentalist Church of Jesus Christ of Latter Day Saints (FLDS). It was a happy accident that this book was sent to me, and I found myself intrigued enough to continue after reading through the first chapter. To an outsider, the practices of the FLDS can be appalling, such as the practice of polygamy, the powerlessness of women in the society and the families torn apart by the church officials. This book takes a sensi...
Appalling Story, Gripping Read
1
Wish I had a lot of this stuff. It is hard to get here and I had to order it. The Beistle Company sent it super fast and it was exactly what I wanted and needed.. Hope to get more soon.
The Beistle Company sent it super fast and it was exactly what I wanted and ...
1,275 rows|Truncated data
# set seed for reproducibility
(trainingData, testingData) = df.randomSplit([0.8, 0.2], seed = 47)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testingData.count()))
Training Dataset Count: 627217 Test Dataset Count: 157888
feature_names = ['featuresCount',             
                                            'countVectorsSum', 
                                            'freq_word_count',
                                            'verified',
                                            'reviews_per_customer',
                                            'products_reviewed_per_customer',
                                            'reviews_per_product',
                                            'Review_Length',
                                            'overall_high',
                                            'positive_experience',
                                            'neutral_experience',
                                            'negative_exp_reviewer',
                                            'days_since_review_creation',
                                            'months_since_review_creation',
                                            'months_since_customers_first_review',
                                            'months_since_customers_last_review',
                                            'reviews_per_year_per_customer',
                                            'High_Volume_Timeframes',
                                            'reviewdayOfWeek',
                                            'reviewers_per_product',
                                            'product_earliest_review',
                                            'product_latest_review',
                                            'review_length_original_chars',
                                            'review_length_simplified_chars',
                                            'avg_sentiment_score_per_product',
                                            'review_divergence',
                                            'average_sentiment_score_per_customer',
                                            'median_sentiment_score_per_customer',
                                            'median_sentiment_score_per_product',
                                            # 'average_sentiment_score_per_product',
                                            'tfidf',
                                            'question_marks_count',
                                            'exclamation_marks_count',
                                            'all_caps_count'
                                            ]
# We'll tokenize the text using a simple RegexTokenizer
regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words_1", pattern="\\W")
regexTokenizerSum = RegexTokenizer(inputCol="summary", outputCol="wordsSum", pattern="\\W")

# Remove standard Stopwords
stopwordsRemover = StopWordsRemover(inputCol="words_1", outputCol="filtered")
stopwordsRemoverSum = StopWordsRemover(inputCol="wordsSum", outputCol="filteredSum")

countVectors = CountVectorizer(inputCol="filtered", outputCol="featuresCount", vocabSize=3000, minTF=1, maxDF=0.40)

countVectorsSum = CountVectorizer(inputCol="filteredSum", outputCol="countVectorsSum", vocabSize=3000, minTF=1, maxDF=0.40)

# Generate Inverse Document Frequency weighting
# idf = IDF(inputCol="featuresCount", outputCol="idfFeatures", minDocFreq=100)

hashing_tf = HashingTF(inputCol="words_1", outputCol="raw_features_hash", numFeatures=50)
tfidf = IDF(inputCol="raw_features_hash", outputCol="tfidf")

featureAssembler = VectorAssembler(inputCols=feature_names, outputCol="features")

# ml_alg  = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.0)

pipeline = Pipeline(stages=[regexTokenizer, regexTokenizerSum,
                            stopwordsRemover, 
                            countVectors, 
                            stopwordsRemoverSum, 
                            countVectorsSum,
                            hashing_tf,
                            tfidf,
                            featureAssembler,
                            ])

# paramGrid = ParamGridBuilder() \
#     .addGrid(countVectors.minTF, [1, 3, 5, 7, 10, 25, 50]) \
#     .addGrid(countVectors.maxDF, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) \
#     .addGrid(countVectors.vocabSize, [500, 1000, 2000, 2500, 3000, 5000]) \
#     .build()
# Fit the pipeline to training documents.

pipelineFit = pipeline.fit(trainingData)
trainingDataTransformed = pipelineFit.transform(trainingData)
# display(trainingDataTransformed)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingDataTransformed)
trainingSummary = lrModel.summary

print("Training Accuracy:  " + str(trainingSummary.accuracy))
print("Training Precision: " + str(trainingSummary.precisionByLabel))
print("Training Recall:    " + str(trainingSummary.recallByLabel))
print("Training FMeasure:  " + str(trainingSummary.fMeasureByLabel()))
print("Training AUC:       " + str(trainingSummary.areaUnderROC))
Training Accuracy: 0.8540006081185417 Training Precision: [0.8646267829034158, 0.7241386567886684] Training Recall: [0.9745572215018363, 0.30444823955190203] Training FMeasure: [0.9163066590011042, 0.42867110214048987] Training AUC: 0.8707283838660484
rf_classifier = RandomForestClassifier(
    # numTrees=100,  # Number of decision trees in the forest
    # maxDepth=10,   # Maximum depth of each decision tree
    labelCol="label",
    featuresCol="features"
)

rf_model = rf_classifier.fit(trainingDataTransformed)
trainingSummary = rf_model.summary

print("Training Accuracy:  " + str(trainingSummary.accuracy))
print("Training Precision: " + str(trainingSummary.precisionByLabel))
print("Training Recall:    " + str(trainingSummary.recallByLabel))
print("Training FMeasure:  " + str(trainingSummary.fMeasureByLabel()))
print("Training AUC:       " + str(trainingSummary.areaUnderROC))
WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: canadacentral.azuredatabricks.net. Connection pool size: 10 WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: canadacentral.azuredatabricks.net. Connection pool size: 10 WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: canadacentral.azuredatabricks.net. Connection pool size: 10 WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: canadacentral.azuredatabricks.net. Connection pool size: 10
Training Accuracy: 0.834061642843384 Training Precision: [0.8393551147015151, 0.6930388101733697] Training Recall: [0.9864584708321363, 0.13936696427781367] Training FMeasure: [0.9069808325383703, 0.23206642256717033] Training AUC: 0.8408339224774074
gbt = GBTClassifier(
    featuresCol="features", 
    labelCol="label")
gbt_model = gbt.fit(trainingDataTransformed)

binary_evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
predictions = gbt_model.transform(trainingDataTransformed)

auc = binary_evaluator.evaluate(predictions)
print(auc)

# print("Training Accuracy:  " + str(trainingSummary.accuracy))
# print("Training Precision: " + str(trainingSummary.precisionByLabel))
# print("Training Recall:    " + str(trainingSummary.recallByLabel))
# print("Training FMeasure:  " + str(trainingSummary.fMeasureByLabel()))
# print("Training AUC:       " + str(trainingSummary.areaUnderROC))
0.8843714140540476
trainingSummary.roc.show()
Command skipped
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
for objective in objectiveHistory:
    print(objective)
Command skipped
testingData = testingData.fillna("None", subset=["reviewText", "summary"])
testingDataTransform = pipelineFit.transform(testingData)
# display(testingDataTransform)
NameError: name 'testingData' is not defined
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = gbt_model.transform(testingDataTransform)
predictions.show(5)
Command skipped
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print('Test Area Under ROC', evaluator.evaluate(predictions))
Command skipped
# Load in the tables
test_df = spark.sql("select * from default.reviews_test")
test_df.show(5)
print((test_df.count(), len(test_df.columns)))
+--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+ |reviewID|overall|verified| reviewTime| reviewerID| asin|reviewerName| reviewText| summary|unixReviewTime| +--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+ |80000001| 4.0| false|07 27, 2015|A1JGAP0185YJI6|0700026657| travis|I played it a whi...|But in spite of t...| 1437955200| |80000002| 5.0| true| 03 3, 2014|A1WK5I4874S3O2|0700026657| WhiteSkull|I bought this gam...|A very good game ...| 1393804800| |80000003| 5.0| true|01 12, 2013|A1YDQQJDRHM0FJ|0001713353| Leila|I am very happy w...|One of our famili...| 1357948800| |80000004| 5.0| true|11 20, 2011|A2E6AHFDJ3JBAZ|0681795107| robosolo|I purchased two o...|Insulated stainle...| 1321747200| |80000005| 5.0| false|06 28, 2011|A38NXTZUFB1O2K|0700099867| FiSH|I'm not quite fin...| Best in the series!| 1309219200| +--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+ only showing top 5 rows (348621, 10)
test_df = test_df.fillna("None", subset=['reviewText', 'summary'])
def freq_word_count_test(data):

    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    data = regexTokenizer.transform(data)

    data = data.withColumn("wordCount", size(data["words"]))

    freq_word_list = [row.word for row in filtered_words.select("word").collect()]
    freq_word_set = set(freq_word_list)

    def count_freq_words(words):
        return len([word for word in words if word in freq_word_set])

    count_freq_words_udf = udf(count_freq_words, IntegerType())
    data = data.withColumn("freq_word_count", count_freq_words_udf(col("words")))

    return data
test_df = date_time_extraction(test_df)
test_df = review_counts_per_customer(test_df)
test_df = review_counts_per_product(test_df)
test_df = review_level_features(test_df)
test_df = customer_level_date_time_features(test_df)
test_df = org_level_features(test_df)
test_df = detecting_non_verbal_cues(test_df)
test_df = cleaning_review_data(test_df)
test_df = freq_word_count_test(test_df)
IllegalArgumentException: Output column words already exists.
drop_list = [
    'asin', 
    # 'reviewID', 
    'reviewerID', 
    'unixReviewTime',
    'reviewTime', 
    'image', 
    'style', 
    'reviewerName', 
    'overall', 
    'stemmed_text', 
    'lowered_text', 
    'cleaned_text', 
    'words', 
    'filtered_text']

test_df = test_df.select([column for column in test_df.columns if column not in drop_list])
print((test_df.count(), len(test_df.columns)))
(348621, 34)
test_df_Transform = pipelineFit.transform(test_df)
# display(test_df_Transform)
Command skipped
test_df_Transform = test_df_Transform.fillna(0, subset = test_df_Transform.columns)
Command skipped
predictions = lrModel.transform(test_df_Transform)
Command skipped
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

probelement=udf(lambda v:float(v[1]),FloatType())
submission_data = predictions.select('reviewID', probelement('probability')).withColumnRenamed('<lambda>(probability)', 'label')

display(submission_data.select('reviewID', 'label'))
Command skipped